* working directory should be set to the folder into which my repository has been extracted

/***********************************************
	preliminaries
***********************************************/

* correcting coordinates for some courts
clear
input str13 city str13 state latitude longitude
"ARLINGTON"		"VA"			38.856464	-77.050461
"PHILADELPHIA"	"PA"			39.950620	-75.155489
"SAN DIEGO"		"CA"			32.718592	-117.166867
"ELIZABETH"		"New Jersey"	40.666285	-74.189695
"OTAY MESA"		"CA"			32.575133	-116.914764
"RENO"			"NEVADA"		39.462365	-119.777628
"SAN ANTONIO"	"TX"			29.423967	-98.498831
"OAKDALE"		""				30.828756	-92.640713
end
tempfile courts
save `courts'

use data/original_article/data/Data/out/courtgps, clear
replace city = strtrim(city)
replace state = strtrim(state)
merge 1:1 city state using `courts', update replace assert(1 5) nogenerate
isid city // to verify we don't need to differentiate by state later
rename (latitude longitude) =2
save `courts', replace

* confirming that CO data for 2001 is actually 2002 data, i.e., the real 2001 data is missing
use data/original_article/data/Data/environment/Pollution/co2001.dta, clear
append using data/original_article/data/Data/environment/Pollution/co2002.dta
duplicates report // all duplicates
codebook date // all 2002


/***********************************************
	find best measurement location for each variable in each city and year
***********************************************/
cap program drop selectstations // do as a program because code snippet repeats for: weather, co, pm, and ozone
program define selectstations
	syntax namelist, station(string) courtfile(string) sourcefile(string) // namelist = what is being measured (temperature etc.); station = where it is measured; courtfile = courts' lat/long; sourcefile = where measurements are stored
	tempfile coverage
	
	* calculate coverage (e.g., days with complete data) for each station-year
	use `namelist' `station' date using `sourcefile', clear
	egen coverage = rowmiss(`namelist')
	assert !mi(coverage)
	recode coverage (0 = 1) (nonmiss = 0) // 1 indicates that station had complete coverage for all measures on that day
	gen year = year(date)
	collapse (count) coverage, by(`station' year)
	sort `station'
	save `coverage'

	* calculate distances of stations from courts and retain only stations at less than 20 miles
	use `station' latitude longitude using `sourcefile', clear
	duplicates drop // reduces to one observation per station
	cross using `courtfile'
	geodist latitude longitude latitude2 longitude2, gen(distance) miles
	bys city: egen mindist = min(dist) // this is only to verify that all or almost all cities HAVE a station within 20 miles -- see next line
	tab city if mindist>20
	keep if distance <= 20 // now we are down to candidate stations within 20 miles
	keep city state `station' distance

	* select for each court and year the station with the most complete coverage within 20 miles (tiebreaker: closer station)
	joinby `station' using `coverage', unmatched(master) // get coverage for each candidate station (which might be a candidate for multiple cities, hence joinby not merge 1:m) in each year
	assert _merge==3 // i.e., all cities were matched to weather/environment coverage
	bys city year: egen maxcoverage = max(coverage)
	keep if float(coverage)==float(maxcoverage)
	bys city year: egen mindist = min(dist)
	keep if float(dist)==float(mindist) // in case there are multiple stations within 20 miles with equal coverage, keep the closer one
	keep city state year `station'
	isid city year // this checks that the last step generated a unique match of station to city for every year
end

selectstations temp6t4 press6t4 dew6t4 prcp6t4 wind6t4, station(wban) courtfile(`courts') sourcefile(data/original_article/reconstructed_data/hourlyweather_vargen_GMTadj_nogaps_HS.dta) // no stations w/in 20mi: Oakdale LA (623 cases in asylum.dta), Laredo TX (0; or 123 if using location instead of city). Also, Eloy's WBAN 3914 only comes online in 2002, so we lose approximately 400 more cases
tempfile weatherstations
save `weatherstations'

foreach feature in ozone co pm {
	tempfile `feature' `feature'stations
	
	* prepare the actual data by combining yearly files into one
	use data/original_article/data/Data/environment/Pollution/`feature'2000.dta, clear
	forvalues year=2001/2004 {
		append using data/original_article/data/Data/environment/Pollution/`feature'`year'.dta
		}
	duplicates drop
	collapse `feature'*, by(latit longit date) // there are several thousand date-location combinations with more than one measurement of the same variable, e.g., ozone. Also, the 2001 co file contains 2002 values
	egen stationid = group(latit longit) // can't use cityname because there are often multiple observations per city and date: e.g., in co2003, there are 321 cities, but 492 latitudes and longitudes
	isid stationid date
	save ``feature''
	
	* get the stations
	if "`feature'" == "pm" local var "pm25"
	else local var "`feature'"
	selectstations `var', station(stationid) courtfile(`courts') sourcefile(``feature'')
	save ``feature'stations'
}


/***********************************************************
	merge into asylum data
************************************************************/
use data/original_article/data/Data/raw/asylum.dta, clear
gen year = year(date)
merge m:1 city year using `weatherstations', keep(1 3) nogenerate
merge m:1 wban date using data/original_article/reconstructed_data/hourlyweather_vargen_GMTadj_nogaps_HS.dta, keep(1 3) nogenerate
foreach feature in ozone co pm {
	merge m:1 city year using ``feature'stations', keep(1 3) nogenerate
	merge m:1 stationid date using ``feature'', keep(1 3) nogenerate
}

merge m:1 city date using data/original_article/data/Data/Environment/Weather/skydistance.dta, keep(1 3) keepusing(dsky skycover)
sum dsky
replace skycover=. if dsky>20*1.60934 // assuming dsky is recorded in km, i.e., erring on the side of fewer missing measurements
count if dsky<. & dsky>20 // what difference would it make if we assumed dsky is recorded in miles? --> would lose 39k obs.
drop dsky _merge

sum co if year!=2001, meanonly
replace co=r(mean) if year==2001 // cf. beginning of this script: 2001 data isn't actually 2001 data but 2002 data, so I replace it by its mean

save data/original_article/reconstructed_data/matched_HS_improved_corrections_stablestations.dta, replace
